CAS Logo Open main paper 🔗

2  Estimating the five fairness premiums

Objectives

Building on the simulations in Chapter 1, this section estimates the spectrum of fairness. It complements Section 4.3 of the main paper, linked from the supplement’s introduction and in the left margin of the current interface. We recommend keeping the main paper nearby, as this supplement focuses exclusively on implementation.

Packages for this section
library(tidyverse)
library(latex2exp)
library(jsonlite)

## also setup python for optimal transport. 
library(reticulate)
get_python_env_path <- function(file_path = "python_env_path.txt") {
  if (!file.exists(file_path)) {
    stop("The file 'python_env_path.txt' is missing. Please create it and specify your Python environment path.")
  }
  
  # Read the file and extract the first non-comment, non-empty line
  env_path <- readLines(file_path, warn = FALSE)
  env_path <- env_path[!grepl("^#", env_path) & nzchar(env_path)]
  
  if (length(env_path) == 0) {
    stop("No valid Python environment path found in 'python_env_path.txt'. Please enter your path.")
  }
  
  return(trimws(env_path[1]))
}
tryCatch({
  python_env_path <- get_python_env_path()
  use_python(python_env_path, required = TRUE)
  message("Python environment successfully set to: ", python_env_path)
}, error = function(e) {
  stop(e$message)
})
Data for this section
sims <- fromJSON('simuls/train_scenarios.json')
valid <-  fromJSON('simuls/valid_scenarios.json')
test <- fromJSON('simuls/test_scenarios.json')

## For the experiment in section 5
sim_samples <- jsonlite::fromJSON('simuls/sim_study.json')
Functions and objects from past sections
levels_for_premiums <- c("mu_B", "mu_U", "mu_A", "mu_H", "mu_C")
labels_for_premiums <- c("$\\widehat{\\mu}^B$","$\\widehat{\\mu}^U$", "$\\widehat{\\mu}^A$", "$\\widehat{\\mu}^H$", "$\\widehat{\\mu}^C$")

## four ingredients for the 5 families of premiums
premium_generator <- function(best, pdx, maps_to_corr, marg){
  list("mu_B" = function(x1, x2, d){
    
    ## simple call of the best estimate model
    best(data.frame(X1 = x1,
                    X2 = x2,
                    D = d))
  }, "mu_U" = function(x1, x2, d = NULL){
    
    ## Explicit inference : mu_U = E(mu_B|X)
    tab_best <- sapply(0:1, function(dl){
      best(data.frame(X1 = x1,
                    X2 = x2,
                    D = rep(dl, length(x1)))) 
      }) 
    tab_pdx <- sapply(0:1, function(dl){
      pdx(data.frame(X1 = x1,
                    X2 = x2,
                    D = rep(dl, length(x1))))
      })
    
    (tab_best * tab_pdx) %>% apply(1, sum)
  }, "mu_A" = function(x1, x2, d = NULL){
    
    ## mu_A = E(mu_B) unconditional
    sapply(0:1, function(d){best(data.frame(X1 = x1,
                    X2 = x2,
                    D = rep(d, length(x1))))*
       marg[d + 1]}) %>% apply(1, sum)
  }, "mu_H" = function(x1, x2, d = NULL){
    
    ## explicit inference of mu_C : mu_H = E(mu_C|X)
    tab_corr <- sapply(0:1, function(dl){
      sb <- best(data.frame(X1 = x1,
                    X2 = x2,
                    D = rep(dl, length(x1))))
      maps_to_corr(data.frame(mu_B = sb, D = dl))
      }) 
    tab_pdx <- sapply(0:1, function(dl){
      pdx(data.frame(X1 = x1,
                    X2 = x2,
                    D = rep(dl, length(x1))))
      })
    
    (tab_corr * tab_pdx) %>% apply(1, sum)
  }, "mu_C" = function(x1, x2, d = NULL){
    
    ## mu_C = T^{d}(mu_B(x, d))
    mu_b <- best(data.frame(X1 = x1,
                    X2 = x2,
                    D = d))
    maps_to_corr(data.frame(mu_B = mu_b, D = d))
  })
}

levels_for_quants <- c('proxy_vuln', 'risk_spread', 'fair_range', 'parity_cost')


quant_generator <- function(premiums){
  list('proxy_vuln' = function(x1, x2, d){
    premiums$mu_U(x1 = x1, x2 = x2) - 
      premiums$mu_A(x1 = x1, x2 = x2)
  },
  'risk_spread' = function(x1, x2, x3, d){
    to_minmax <- data.frame(risk1 = premiums$mu_B(x1 = x1,
                                              x2 = x2, 
                                              d = 1),
                            risk0 = premiums$mu_B(x1 = x1,
                                              x2 = x2, 
                                              d = 0))
    apply(to_minmax, 1, max) - apply(to_minmax, 1, min) 
  },
  'fair_range' = function(x1, x2, d){
    to_minmax <- data.frame(mu_b = premiums$mu_B(x1 = x1, x2 = x2, 
                                           d = d),
                           mu_u = premiums$mu_U(x1 = x1, x2 = x2, 
                                          d = NULL),
                           mu_a = premiums$mu_A(x1 = x1, x2 = x2, 
                                          d = NULL),
                           mu_h = premiums$mu_H(x1 = x1, x2 = x2,
                                          d = NULL),
                           mu_c = premiums$mu_C(x1 = x1, x2 = x2, 
                                          d = d))
    
    apply(to_minmax, 1, max) - apply(to_minmax, 1, min) 
  },
  'parity_cost' = function(x1, x2, d){
    premiums$mu_C(x1 = x1, x2 = x2,
              d = d) -
      premiums$mu_B(x1 = x1, x2 = x2, 
                d = d)
  })
}

the_CAS_colors <- c('#FED35D', '#F89708', '#205ED5', '#142345')

In this section, we provide an example of detailed methodology for estimating the benchmarks premiums from the spectrum of fairness defined in Section 4.1 of the main paper. It complements the estimation of Section 4.1.

In this paper, premiums are estimated using lightgbm algorithms from Ke et al. (2017), a package that supports common actuarial distributions (Tweedie, Poisson, Gamma, etc.) and was found to have excellent predictive performance compared to other boosting algorithm in Chevalier and Côté (2024). Given lightgbm’s flexibility, we advocate for careful variable preselection to:

We optimize hyperparameters with a validation set to prevent overfitting. Key hyperparameters in lightgbm are the number of trees (num_iterations), regularization (lambda_l1, lambda_l2) for complexity control, and subsampling percentages (feature_fraction, bagging_fraction) .

The data used to construct prices and the estimated spectrum should align to ensure comparability. In the Case Study of the main paper, we conduct a posteriori benchmarking, as the study period is historical. A posteriori assessment can reveal segments where the model misaligned from fairness dimensions after the fact. In contrast, conducting benchmarking concurrently with the development of a new pricing model provides an estimate of how well the commercial price aligns with fairness dimensions for future periods, though the true fairness implications of the commercial price will only become evident retrospectively.

Discretizing \(D\)

Subsequent steps involve numerous computations over all possible values of \(D\). When \(D\) is continuous, discretization can improve tractability. The discretized version should be used throughout the estimation of the spectrum, and the discretization function should be kept for future applications. For multivariate \(D\), one approach is to first discretize continuous components, then treat the vector as one categorical variable where each unique combination defines a category. Care should be taken to ensure a manageable number of categories, with sufficient representation in each.

Balancing benchmark premiums

Profit and expense loadings are important, but they must not reintroduce unfairness. To align premium benchmarks with expected profits, we multiply the premiums by a factor \(\delta_j \geq 1~\forall j \in \{B, U, A, H, C\}\) to globally balance all premiums to the level of the commercial price.

In what follows, we detail how we estimate each benchmark premium in our framework.

2.1 Best-estimate premium

The best estimate premium \(\widehat{\mu}^B(\mathbf{x}, d)\) serves as the anchor. Thus, its estimation is crucial for our framework. It provides the best predictor of the response variable \(Y\) when differentiating risks based on \((X, D)\). It can be derived from indicated rates or data-driven (what we do) estimates as detailed in Complement 5 of the main paper.

Training the data-driven best-estimate price
source('___lgb_best_estimate.R')

## clean the pred repo
unlink(file.path('preds', "*_best_estimate.json"))

# Define grid for hyperparameter optimization
  hyperparameter_grid <- expand.grid(
    learning_rate = c(0.01),
    feature_fraction = c(0.75),
    bagging_fraction = c(0.75),
    max_depth = c(5),
    lambda_l1 = c(0.5),
    lambda_l2 = c(0.5)
  )

best_lgb <- setNames(nm = names(sims)) %>% lapply(function(name){
  list_df <- list('train' = sims[[name]],
                  'valid' = valid[[name]],
                  'test' = test[[name]])
  the_best_estimate_lightgbm_fun(list_data = list_df, 
                                 name = name,
                                 hyperparameter_grid = hyperparameter_grid)
})
Best for scenario:  Scenario1  
hyperparam  1 / 1 
Best valid mse: 52.28864  
optimal ntree: 512  
Training time: 27.02974  sec. 
Best for scenario:  Scenario2  
hyperparam  1 / 1 
Best valid mse: 51.98283  
optimal ntree: 604  
Training time: 30.19427  sec. 
Best for scenario:  Scenario3  
hyperparam  1 / 1 
Best valid mse: 53.0136  
optimal ntree: 500  
Training time: 25.4071  sec. 
Training the best-estimate price on experimental data (for later use in section 5)
# Define grid for hyperparameter optimization
hyperparameter_grid_sims <- expand.grid(
    learning_rate = c(0.01),
    bagging_fraction = c(0.75),
    max_depth = c(6),
    lambda_l1 = c(0.5),
    lambda_l2 = c(0.5)
  )

best_sims <- lapply(seq_along(sim_samples), function(idx){
    list_df <- list('train' = sim_samples[[idx]]$train,
                  'valid' = sim_samples[[idx]]$valid,
                  'test' = sim_samples[[idx]]$test)
  the_best_estimate_lightgbm_fun(list_data = list_df,
                                 name = paste0('sim', idx),
                                 hyperparameter_grid = hyperparameter_grid_sims)
})

2.2 Unaware premium

The unaware price, \(\mu^U(\mathbf{x})\), excludes direct use of \(D\). It is defined as the best predictor of \(\widehat{\mu}^B(\mathbf{X}, D)\) given \(\mathbf{x}\):
\[\begin{equation*} \widehat{\mu}^U(\mathbf{x}) = E\{\widehat{\mu}^B(\mathbf{x}, D) | X = \mathbf{x}\}. \end{equation*}\]
This maintains consistency with the best estimate premium, whether \(\widehat{\mu}^B\) is data-driven or based on indicated rates (Complement 5 of the main paper). One can use a to predict \(\widehat{\mu}^B(\mathbf{X}, D)\) based on \(\mathbf{X}\). The loss function defaults to mean squared error as it is a reasonable distributional assumption for this response variable.

Alternatively (what we do), one can estimate the propensity and explicitly weight the best-estimate premiums based on predicted propensity. This also keeps consistency with the best-estimate.

Estimating the propensity function
source('___lgb_pdx_estimate.R') 

## clean the preds repo
unlink(file.path('preds', "*_pdx.json"))

  hyperparameter_grid <- expand.grid(
    learning_rate = c(0.01),
    feature_fraction = c(0.75),
    bagging_fraction = c(0.75),
    max_depth = c(5),
    lambda_l1 = c(0.5),
    lambda_l2 = c(0.5)
  )

pdx_lgb <- setNames(nm = names(sims)) %>% lapply(function(name){
  list_df <- list('train' = sims[[name]],
                  'valid' = valid[[name]],
                  'test' = test[[name]])
  the_pdx_lightgbm_fun(list_data = list_df, name = name,
                       hyperparameter_grid = hyperparameter_grid)
})
Pdx for scenario:  Scenario1  
hyperparam  1 / 1 
Best valid binary logloss: 0.5821824  
optimal ntree: 350  
Training time: 18.22779  sec. 
Pdx for scenario:  Scenario2  
hyperparam  1 / 1 
Best valid binary logloss: 0.5787944  
optimal ntree: 508  
Training time: 25.75189  sec. 
Pdx for scenario:  Scenario3  
hyperparam  1 / 1 
Best valid binary logloss: 0.6335532  
optimal ntree: 571  
Training time: 27.79643  sec. 
Training the propensity function on experimental data (for later use in section 5)
# Define grid for hyperparameter optimization
hyperparameter_grid_sims <- expand.grid(
    learning_rate = c(0.01),
    bagging_fraction = c(0.75),
    max_depth = c(6),
    lambda_l1 = c(0.5),
    lambda_l2 = c(0.5)
  )

pdx_sims <- lapply(seq_along(sim_samples), function(idx){
    list_df <- list('train' = sim_samples[[idx]]$train,
                  'valid' = sim_samples[[idx]]$valid,
                  'test' = sim_samples[[idx]]$test)
  the_pdx_lightgbm_fun(list_data = list_df, name = paste0('sim', idx),
                       hyperparameter_grid = hyperparameter_grid_sims)
})

2.3 Aware premium

The aware premium, \(\widehat{\mu}^A(\mathbf{x})\), requires knowledge of the marginal distribution of \(D\). The aware price, a particular case of the ``discrimination-free’’ price of Lindholm et al. (2022), is computed as:
\[\begin{equation*} \widehat{\mu}^A(\mathbf{x}) = \sum_{d\in \mathcal{D}} \widehat{\mu}^B(\mathbf{x}, d) \widehat{\Pr}(D = d). \end{equation*}\] As discussed earlier, we assume \(D\) is discrete or discretized. If the training sample is representative of the target population (portfolio, market, or region?), empirical proportions are consistent estimators of \(\widehat{\Pr}(D = d)\). This is also an estimator suggested in Section 5 of Lindholm et al. (2022).

Computing the empirical proportions for protected supopulations
marg_dist <- sims %>% lapply(function(data){
  data$D %>% table %>%  prop.table
})

## for later use
saveRDS(marg_dist, 'preds/the_empirical_proportions.rds')

## Computing the empirical proportions for protected supopulations on experimental data (for later use in section 5)
marg_dist_sims <- seq_along(sim_samples) %>% lapply(function(idx){
  sim_samples[[idx]]$train$D %>% table %>%  prop.table
})
Note

This method generalizes the use of \(D\) as a control variable to prevent distortions in estimated effect of \(\mathbf{X}\) due to omitted variable bias. It applies to all model types and has a causal interpretation under the assumptions that: (i) possible values of \(\mathbf{X}\) are observed across all groups of \(D\) (no extrapolation), (ii) \(D\) causes both \(\mathbf{X}\) and \(Y\), and (iii) there are no relevant unobserved variables. Under the same assumptions, other methods are estimators of the aware premium. One example is inverse probability weighting, which re-weights observations to (artificially) remove the association between \(\mathbf{X}\) and \(D\), preventing proxy effects.

2.4 Corrective premium

We estimate the corrective premium, \(\widehat{\mu}^C(\mathbf{x}, d)\), designed to enforce strong demographic parity by aligning premium distributions across protected groups. Various methods can achieve this, but we recommend the optimal transport approach of Hu, Ratz, and Charpentier (2024). Its advantage lies in modifying \(\widehat{\mu^B}(\mathbf{x}, d)\) while keeping the overall spectrum estimation anchored to the initial best-estimate premium. As an incremental adjustment, it minimizes modeling complexity while enforcing demographic parity, yielding a simple estimator for the corrective premium.

The corrective premium is computed by training a transport function that shifts best-estimate premiums for each protected group toward a common barycenter, ensuring demographic parity through corrective direct discrimination. See Hu, Ratz, and Charpentier (2024) for details. This optimal transport method is implemented in Python via the Equipy package of Fernandes Machado et al. (2025). The following chunk illustrates how this method, despite its complexity, translates into concise Python code.

Listing 2.1: Illustrative Python code for wasserstein transport toward corrective premiums.
import numpy as np
import pandas as pd
from equipy.fairness import FairWasserstein

# Load best-estimate premiums and associated sensitive attributes
best_est_train = pd.read_csv('best_est_prem_train.csv')  # Training premiums
sens_train = pd.read_csv('sensitive_train.csv')  # discrete sensitive data

# Train Fair Wasserstein transport to adjust premiums
barycenter = FairWasserstein()
barycenter.fit(best_est.values, sens.values)

# Load best-estimate premiums and associated sensitive attributes
best_est_test = pd.read_csv('best_est_prem_test.csv')  # Training premiums
sens_test = pd.read_csv('sensitive_test.csv')  # discrete sensitive data

# Apply transformation to obtain fairness-adjusted premiums
corrective_premiums_test = barycenter.transform(best_est_test.values, sens_test.values)

# Save results
pd.DataFrame(corrective_premiums).to_csv('corr_prem_test.csv', index=False)
Optimal transport and Wasserstein distance : technical details

2.4.1 A statistical Introduction

2.4.1.1 Comparing two samples

Suppose that a quantity of interest \(z\) is observed in two groups, with two samples (of equal size to simplify notations), \(\{z_0^{(1)},z_0^{(2)},...,z_0^{(n)}\}\) and \(\{z_1^{(1)},z_1^{(2)},...,z_1^{(n)}\}\). How can you tell if these two groups are similar or different? A common classical approach is to compare their means. For instance, a t-test checks whether two samples have significantly different averages. However, this approach assumes something about the shape of the data, typically that both samples come from Gaussian (normal) distributions with similar variances. If the data are skewed, multimodal, or contain outliers, comparing means alone can be misleading.

2.4.1.2 Comparing two samples (nonparametric test)

When you don’t want to assume a specific distribution, you can use nonparametric tests such as the Kolmogorov–Smirnov (KS) test or the Mann–Whitney U test.These tests compare the empirical distributions of the samples directly, essentially, how their cumulative distribution functions (CDFs) differ. This gives a more flexible notion of “difference”, but it doesn’t produce a continuous distance that tells how far apart the samples are.

2.4.1.3 Comparing two samples with Wasserstein distance

Optimal transport provides exactly that: a way to measure the distance between distributions. Suppose also that the two samples are ordered, \(z_0^{(i-1)}\leq z_0^{(i)},...,z_0^{(i+1)}\}\) (and similarly in group \(1\)), then Wasserstein distance is

\[ \widehat W_p^p=\frac{1}{n}\sum_{i=1}^{n}\big| z_{(1)}^{(i)}-z_{(0)}^{(i)}\big|^p. \]

2.4.1.4 Wasserstein distance and Quantiles

If \(\hat F_{0,n}\) and \(\hat F_{1,n}\) denote the empirical cumulative distribution functions, \(z_{(1)}^{(i)}=\hat F_{1,n}^{-1}(i/n)\) and \(z_{(0)}^{(i)}=\hat F_{0,n}^{-1}(i/n)\), so that

\[ \widehat W_p^p=\frac{1}{n}\sum_{i=1}^{n}\big| \hat F_{1,n}^{-1}(i/n)-\hat F_{0,n}^{-1}(i/n)\big|^p. \] Thus, Wasserstein distance is a distance between quantile function \(F^{-1}\), that can be extended to the case where the two samples do not have the same size.

2.4.1.5 Wasserstein distance and optimal pairing

The pairing \(T_{0\to 1}^*:z_{(0)}^{(i)}\mapsto z_{(1)}^{(i)}\) is the optimal pairing, that solves the optimal transport problem

\[ T_{0\to 1}^*\in\mathrm{argmin}\sum_{i=1}^{n}\big| T(z_{(0)}^{(i)})-z_{(0)}^{(i)}\big|^p \]

where \(T\) takes values in \(\{z_1^{(1)},z_1^{(2)},...,z_1^{(n)}\}\), and \(T\) is a one-to-one pairing. From Hardy-Littlewood-Polya inequality, when \(p=2\), \(T_{0\to 1}^*\) is a comonotonic paring, and should preserve the ranks : the \(i\)-th ranked observations should be matched together. In a nutshell, it comes from the convexity of the cost function, since

\[ (z_{(1)}^{(i)}-z_{(0)}^{(i)})^2 + (z_{(1)}^{(j)}-z_{(0)}^{(j)})^2 \leq (z_{(1)}^{(i)}-z_{(0)}^{(j)})^2 + (z_{(1)}^{(j)}-z_{(0)}^{(i)})^2 \]

as soon as \(i\neq j\). Observe that \(T_{0\to 1}^*\) can also be written \(T_{0\to 1}^*(\cdot)=\hat F_{1,n}^{-1}(\hat F_{0,n}(\cdot))\)

2.4.1.6 Wasserstein barycenter

One can also consider a third sample, defined a \(\bar z^{(i)}=(z_{(0)}^{(i)}+z_{(1)}^{(i)})/2\). This is an ordered sample, in between the two. We can write

\[ \overline{z}^{(i)} = \frac{1}{2} z_{(0)}^{(i)} + \frac{1}{2} z_{(1)}^{(i)} = \frac{1}{2} z_{(0)}^{(i)} + \frac{1}{2} T_{0 \to 1}^* z_{(0)}^{(i)} = \frac{1}{2} T_{1 \to 0}^* z_{(1)}^{(i)} + \frac{1}{2} z_{(1)}^{(i)} \]

Assuming \(T_{1 \to 0}^*(\cdot) = \hat{F}_{0,n}^{-1}(\hat{F}_{1,n}(\cdot))\) and \(T_{0 \to 1}^*(\cdot) = \hat{F}_{1,n}^{-1}(\hat{F}_{0,n}(\cdot))\), both expressions yield the same barycentric sample.

2.5 Optimal transport and Wasserstein distance

All those concepts can be formalized more generally. Let \(Z\) be a quantity of interest. Let \(\nu_0\) and \(\nu_1\) be two laws of \(Z\) (finite \(p\)-th moments, \(p\ge1\)).

Probabilistic definition (pairing).

Over all joint laws with the right marginals,

\[ W_p(\nu_0,\nu_1) =\Big(\inf_{\,\mathcal L(Z^{(0)})=\nu_0,\ \mathcal L(Z^{(1)})=\nu_1}\ \mathbb E\big(\,\lVert Z^{(0)}-Z^{(1)}\rVert^p\,\big)\Big)^{1/p}. \]

Interpretation: the smallest average \(p\)-power gap under any admissible pairing.

Units: same as \(Z\); \(W_p=0\) iff \(\nu_0=\nu_1\).

Univariate formula (what we compute).

Let \(F\) and \(G\) be the CDFs of \(\nu_0\) and \(\nu_1\), with quantiles \(F^{-1}\) and \(G^{-1}\). Then

\[ W_p(\nu_0,\nu_1)=\left(\int_{0}^{1}\!\big|F^{-1}-G^{-1}\big|^p\,du\right)^{1/p}. \]

For \(p=1\),

\[ W_1(\nu_0,\nu_1)=\int_{0}^{1}\!\big|F^{-1}(u)-G^{-1}(u)\big|\,du \]

Interpretation: \(W_1\) is the average absolute gap between matched quantiles (interpretable in dollars if \(Z\) is a monetary amount).

Computing the optimal transport mapping for the scenarios
source_python("___opt_transp.py")

## now, the folder 'transported' exist, with one epsilon_y file per scenario

To predict corrective premium on other datasets, one can alternatively train a lightgbm model of mapping from \((X, D)\) to the resulting corrective premiums from Equipy on the training sample.

Computing the optimal transport mapping for the scenarios
source('___lgb_mapping_to_corr.R') 

corr_mapping_lgb <- setNames(nm = names(sims)) %>%
  lapply(the_mapping_to_corr_lightgbm_fun)
Mapping to corr. for scenario:  Scenario1  
Data import: 0.671685  sec. 
Data conversion: 0.02966404  sec. 
Lgb training : 40.27824  sec. 
Mapping to corr. for scenario:  Scenario2  
Data import: 0.6492171  sec. 
Data conversion: 0.02690792  sec. 
Lgb training : 18.2528  sec. 
Mapping to corr. for scenario:  Scenario3  
Data import: 0.6066391  sec. 
Data conversion: 0.02551007  sec. 
Lgb training : 34.17393  sec. 

2.6 Hyperaware premium

The hyperaware premium, \(\widehat{\mu}^H(\mathbf{x})\), is the best non-directly discriminatory approximation of the corrective premium. We construct a lightgbm using only \(\mathbf{X}\) to predict the corrective premium, aiming at demographic parity without direct reliance on \(D\):
\[\begin{equation*} \widehat{\mu}^H(\mathbf{x}) = E\{\widehat{\mu}^C(\mathbf{x}, D) | X = \mathbf{x}\}. \end{equation*}\]
Since the response variable is a premium, MSE is a correct choice of loss function in most cases. Just as for the unaware premium, one can explicitly aggregate corrective premiums across protected groups using estimated propensities as weights. This is what we do here.

2.7 Visualization the estimated spectrum

By combining all the trained models as component into a trained spectrum of fairness, we can define the premium function that will serve to predict the premiums

Defining the estimated premium and metrics functions
premiums <- setNames(nm = names(sims)) %>% lapply(function(name){
  premium_generator(best = best_lgb[[name]]$pred_fun, 
                  pdx = pdx_lgb[[name]]$pred_fun, 
                  maps_to_corr = corr_mapping_lgb[[name]], 
                  marg = marg_dist[[name]])
})

quants <- setNames(nm = names(sims)) %>% lapply(function(name){
  quant_generator(premiums = premiums[[name]])
})


## For experimental data (future use in section 5)
premiums_sims <- setNames(obj = seq_along(sim_samples),
                        nm = names(sim_samples)) %>% lapply(function(idx){
  premium_generator(best = best_sims[[idx]]$pred_fun, 
                  pdx = pdx_sims[[idx]]$pred_fun, 
                  maps_to_corr = corr_mapping_lgb[['Scenario1']], ## We put anything, won't be used anyway as we focus on proxy vulnerabiltiy for section 5. 
                  marg = marg_dist_sims[[idx]])
})

quants_sims <- setNames(obj = seq_along(sim_samples),
                        nm = names(sim_samples)) %>% lapply(function(idx){
  quant_generator(premiums = premiums_sims[[idx]])
})
Computation of estimated premiums and local metrics across the grid
df_to_g_file <- "preds/df_to_g.json"

# Check if the JSON file exists
if (file.exists(df_to_g_file)) {
  message(sprintf("[%s] File exists. Reading df_to_g from %s", Sys.time(), df_to_g_file))
  df_to_g <- fromJSON(df_to_g_file)
} else {

## From the first section of the online supplement
preds_grid_stats_theo <- fromJSON('preds/preds_grid_stats_theo.json')
  
df_to_g <- setNames(nm = names(sims)) %>% lapply(function(name) {
  message(sprintf("[%s] Processing: %s", Sys.time(), name))
  
  local_scenario_df <- preds_grid_stats_theo[[name]]
  
  # Start time for this scenario
  start_time <- Sys.time()
  
  # Step 1: Compute premiums
  message(sprintf("[%s] Step 1: Computing premiums", Sys.time()))
  premium_results <- setNames(nm = levels_for_premiums) %>%
    sapply(function(s) {
      message(sprintf("[%s] Computing premium: %s", Sys.time(), s))
      premiums[[name]][[s]](
        x1 = local_scenario_df$x1,
        x2 = local_scenario_df$x2,
        d = local_scenario_df$d
      )
    })
  
  # Step 2: Compute PDX
  message(sprintf("[%s] Step 2: Computing PDX", Sys.time()))
  pdx_results <- pdx_lgb[[name]]$pred_fun(
    data.frame(
      X1 = local_scenario_df$x1,
      X2 = local_scenario_df$x2,
      D = local_scenario_df$d
    )
  )
  
  # Combine results
  message(sprintf("[%s] Step 5: Combining results", Sys.time()))
  result <- data.frame(
    local_scenario_df,
    premium_results,
    pdx = pdx_results
  )
  
  # Log completion time
  end_time <- Sys.time()
  message(sprintf("[%s] Finished processing: %s (Duration: %.2f seconds)", end_time, name, as.numeric(difftime(end_time, start_time, units = "secs"))))
  
  return(result)
})

# Save the entire df_to_g object to JSON
  message(sprintf("[%s] Saving df_to_g to %s", Sys.time(), df_to_g_file))
  toJSON(df_to_g, pretty = TRUE, auto_unbox = TRUE) %>% write(df_to_g_file)
  rm(df_to_g_theo)
}

grid_stats_path <- 'preds/preds_grid_stats.json'

# Check and load or compute preds_grid_stats
if (file.exists(grid_stats_path)) {
  preds_grid_stats <- fromJSON(grid_stats_path)
} else {
  preds_grid_stats <- setNames(nm = names(df_to_g)) %>% 
    lapply(function(name) {
      data.frame(df_to_g[[name]], 
                 setNames(nm = levels_for_quants) %>% 
                       sapply(function(s) {
                         quants[[name]][[s]](x1 = df_to_g[[name]]$x1,
                                             x2 = df_to_g[[name]]$x2,
                                             d = df_to_g[[name]]$d)
                       }))
    })
  toJSON(preds_grid_stats, pretty = TRUE, auto_unbox = TRUE) %>% 
    write(grid_stats_path)
}
Computing estimated premiums and local metrics for the simulations
pop_to_g_file <- "preds/pop_to_g.json"

# Check if the JSON file exists
if (file.exists(pop_to_g_file)) {
  message(sprintf("[%s] File exists. Reading pop_to_g from %s", Sys.time(), pop_to_g_file))
  pop_to_g <- fromJSON(pop_to_g_file)
} else {
## From the first section of the online supplement
preds_pop_stats_theo <- fromJSON('preds/preds_pop_stats_theo.json')
  
pop_to_g <- setNames(nm = names(sims)) %>% lapply(function(name) {
  message(sprintf("[%s] Processing: %s", Sys.time(), name))
  
  # Start time for this simulation
  start_time <- Sys.time()
  
  list_data <- list('train' = sims[[name]],
                    'valid' = valid[[name]],
                    'test' = test[[name]])
  
  result <- setNames(nm = names(list_data)) %>% lapply(function(nm){
    
    data <- list_data[[nm]]
    
    # Step 1: Compute premiums
  message(sprintf("[%s] Step 1: Computing premiums", Sys.time()))
  premium_results <- setNames(nm = levels_for_premiums) %>%
    sapply(function(s) {
      message(sprintf("[%s] Computing premium: %s", Sys.time(), s))
      premiums[[name]][[s]](
        x1 = data$X1,
        x2 = data$X2,
        d = data$D
      )
    })
  
  # Step 2: Compute PDX
  message(sprintf("[%s] Step 2: Computing PDX", Sys.time()))
  pdx_results <- pdx_lgb[[name]]$pred_fun(
    data.frame(
      X1 = data$X1,
      X2 = data$X2,
      D = data$D
    )
  )
  
  # Combine results
  message(sprintf("[%s] Step 5: Combining results", Sys.time()))
  data.frame(
    preds_pop_stats_theo[[name]][[nm]],
    premium_results,
    pdx = pdx_results
  )
  }) 
  
  
  # Log completion time
  end_time <- Sys.time()
  message(sprintf("[%s] Finished processing: %s (Duration: %.2f seconds)", end_time, name, as.numeric(difftime(end_time, start_time, units = "secs"))))
  
  return(result)
})

# Save the entire df_to_g object to JSON
  message(sprintf("[%s] Saving pop_to_g to %s", Sys.time(), pop_to_g_file))
  toJSON(pop_to_g, pretty = TRUE, auto_unbox = TRUE) %>% write(pop_to_g_file)
  
  rm(pop_to_g_theo)
}


pop_stats_path <- 'preds/preds_pop_stats.json'

# Check and load or compute preds_pop_stats
if (file.exists(pop_stats_path)) {
  preds_pop_stats <- fromJSON(pop_stats_path)
} else {
  preds_pop_stats <- setNames(nm = names(sims)) %>% 
    lapply(function(name) {
      setNames(nm = names(pop_to_g[[name]])) %>%  
        lapply(function(set) {
          local_df <- pop_to_g[[name]][[set]]
          
          data.frame(local_df, 
                 setNames(nm = levels_for_quants) %>% 
                       sapply(function(s) {
                         quants[[name]][[s]](x1 = local_df$X1,
                                             x2 = local_df$X2,
                                             d =  local_df$D)
                       }))
        })
    })
  toJSON(preds_pop_stats, pretty = TRUE, auto_unbox = TRUE) %>% 
    write(pop_stats_path)
}
Computing estimated premiums and local metrics for the experiment
sims_to_g_file <- "preds/sims_to_g.json"

# Check if the JSON file exists
if (file.exists(sims_to_g_file)) {
  message(sprintf("[%s] File exists. Reading sims_to_g from %s", Sys.time(), sims_to_g_file))
  sims_to_g <- fromJSON(sims_to_g_file)
} else {
## From the first section of the online supplement
preds_sims_stats_theo <- fromJSON('preds/preds_sims_stats_theo.json')
sims_to_g <- setNames(object = seq_along(sim_samples), 
                      nm = names(sim_samples)) %>% lapply(function(idx) {
  message(sprintf("[%s] Processing: %s", Sys.time(), paste0(idx, '/', length(sim_samples))))
  
  samples_to_ret <- setNames(nm = names(sim_samples[[idx]])) %>% lapply(function(nm){
  data <- preds_sims_stats_theo[[idx]][[nm]]
    
    # Step 1: Compute premiums
  message(sprintf("[%s] Step 1: Computing premiums", Sys.time()))
  premium_results <- setNames(nm = levels_for_premiums) %>%
    sapply(function(s) {
      message(sprintf("[%s] Computing premium: %s", Sys.time(), s))
      premiums_sims[[idx]][[s]](
        x1 = data$X1,
        x2 = data$X2,
        d = data$D
      )
    })
  
  # Step 2: Compute PDX
  message(sprintf("[%s] Step 2: Computing PDX", Sys.time()))
  pdx_results <- pdx_sims[[idx]]$pred_fun(
    data.frame(
      X1 = data$X1,
      X2 = data$X2,
      D = data$D
    )
  )
  
  
  # Combine results
  message(sprintf("[%s] Step 5: Combining results", Sys.time()))
  result <- data.frame(
    data,
    premium_results,
    pdx = pdx_results
  )
    
  })
  
  return(samples_to_ret)
})

# Save the entire df_to_g object to JSON
  message(sprintf("[%s] Saving sims_to_g to %s", Sys.time(), sims_to_g_file))
  toJSON(sims_to_g, pretty = TRUE, auto_unbox = TRUE) %>% write(sims_to_g_file)
}

sims_stats_path <- 'preds/preds_sims_stats.json'

# Check and load or compute preds_pop_stats
if (file.exists(sims_stats_path)) {
  preds_sims_stats <- fromJSON(sims_stats_path)
} else {
  preds_sims_stats <- setNames(nm = names(sims_to_g)) %>% 
    lapply(function(name) {
      setNames(nm = names(sims_to_g[[name]])) %>%  
        lapply(function(set) {
          local_df <- sims_to_g[[name]][[set]]
          
          data.frame(local_df, 
                 setNames(nm = levels_for_quants) %>% 
                       sapply(function(s) {
                         quants_sims[[name]][[s]](x1 = local_df$X1,
                                             x2 = local_df$X2,
                                             d =  local_df$D)
                       }))
        })
    })
  toJSON(preds_sims_stats, pretty = TRUE, auto_unbox = TRUE) %>% 
    write(sims_stats_path)
}
R code producing the estimated propensity illustration across scenarios
to_save_pdx_perpop <- names(df_to_g) %>% 
  lapply(function(name){
    cols <- the_CAS_colors
    pop_id <- which(names(df_to_g) == name)
    
    ## keep only axis for last plot
    if(pop_id == 1){ # If it's the last, apply correct xlabels
      the_y_scale <- ylim(0,1)
      the_y_label <- latex2exp::TeX("$\\widehat{P}(D = 1|x_1, x_2)$")
    } else { # otherwise, remove everything
      the_y_scale <- scale_y_continuous(labels = NULL, limits = c(0,1))
      the_y_label <- NULL
    }
    
    the_pops <- c('Scenario 1', 'Scenario 2', 'Scenario 3')
    
    ## lets graph
    df_to_g[[name]] %>% 
      mutate(the_population = name) %>% 
  filter(x1 >= -9, x1 <= 11,
         d == 1) %>% 
  group_by(x1, x2) %>% summarise(pdx = mean(pdx)) %>%  ungroup %>% 
  ggplot(aes(x = x1, y = pdx,
             lty = factor(x2),
             linewidth = factor(x2),
             shape = factor(x2),
             alpha = factor(x2),
             color = factor(x2))) +
  geom_line() +
  scale_linetype_manual(values = c('solid', '31', '21', '11'), name = latex2exp::TeX('$x_2$')) +
  scale_color_manual(values = cols, name = latex2exp::TeX('$x_2$')) +
  scale_linewidth_manual(values = c(2, 1, 0.85, 0.55), name = latex2exp::TeX('$x_2$')) +  
  scale_alpha_manual(values = c(0.65, 0.75, 0.85, 0.9), name = latex2exp::TeX('$x_2$')) + 
  labs(x = latex2exp::TeX("$x_1$"),
       y = the_y_label,
       title = latex2exp::TeX(the_pops[pop_id])) + 
  scale_x_continuous(breaks = c(-3:3)*3 + 1)  + # see above
  theme_minimal() + 
  the_y_scale +
  theme(plot.title = element_text(hjust=0.5))
  }) %>% ggpubr::ggarrange(plotlist = .,
                           nrow = 1,
                           widths = 15, heights = 1,
                           common.legend = T,
                           legend = 'right')

if (!dir.exists('figs')) dir.create('figs')
ggsave(filename = "figs/graph_pdx_perpop.png",
       plot = to_save_pdx_perpop,
       height = 3.25,
       width = 7.55,
       units = "in",
       device = "png", dpi = 500)
Figure 2.1: Estimated propensity in terms of \(x_1\) and \(x_2\) for simulations
R code producing the estimated best-estimate, unaware, and aware illustration.
to_save_premiumsdense_BUA_perpop <- names(df_to_g) %>% lapply(function(pop_name){
  
  ## the colors
    cols <- RColorBrewer::brewer.pal(5, 'Spectral')[1:3] %>%  colorspace::darken(0.25)
      # the_CAS_colors_full[c(6, 5, 2)]
  pop_id <- which(names(df_to_g) == pop_name)
  
  ## keep only axis for last plot
    if(pop_name == head(names(df_to_g), 1)){ # If it's the last, apply correct xlabels
      the_y_scale <- scale_y_continuous(breaks = c(90, 110, 130),
                     labels = scales::dollar,
                     limits = c(90, 140))
      the_y_label <- 
  latex2exp::TeX("$\\widehat{\\mu}(x_1, x_2, d)$")
    } else { # otherwise, remove everything
      the_y_scale <- scale_y_continuous(breaks = c(90, 110, 130),
                     labels = NULL,
                     limits = c(90, 140))
      the_y_label <- NULL
    }
  
  to_illustrate <- df_to_g[[pop_name]] %>%
  reshape2::melt(measure.vars = paste0(levels_for_premiums[1:3])) %>% 
    mutate(variable = factor(variable,
                             levels = paste0(levels_for_premiums[1:3]),
                             labels = labels_for_premiums[1:3])) %>% 
  filter(x1 <= 10, x1 >= -8) 
  
  to_illustrate$value[to_illustrate$pdx < 0.1] <- NA
  
  to_illustrate %>% 
  ggplot(aes(x = x1, y = value,
             lty = factor(d),
             linewidth = factor(x2),
             shape = factor(x2),
             alpha = factor(d),
             color = factor(variable))) +
  geom_line() +
  scale_linetype_manual(values = c('solid', '32'), name = latex2exp::TeX('$d$')) +
  scale_color_manual(values = cols, name = latex2exp::TeX('$\\mu$'), labels = latex2exp::TeX) +
  scale_linewidth_manual(values = c(1.5, 1, 0.85, 0.55), name = latex2exp::TeX('$x_2$')) +  
  scale_alpha_manual(values = c(0.35, 0.7), name = latex2exp::TeX('$d$')) + 
  labs(x = latex2exp::TeX("$x_1$"), y = the_y_label,
       title = paste0('Scenario ', pop_id)) +
  scale_x_continuous(breaks = c( -3, 0, 3, 6), limits = c(-4, 7)) +
  the_y_scale +
  theme_minimal()
}) %>% ggpubr::ggarrange(plotlist = .,
                           ncol = 3,
                           widths = c(18, 15, 15), heights = 1,
                           common.legend = T,
                           legend = 'right')

ggsave(filename = "figs/graph_premiumsdense_BUA_perpop.png",
       plot = to_save_premiumsdense_BUA_perpop,
       height = 4,
       width = 10.55,
       units = "in",
       device = "png", dpi = 500)
Figure 2.2: Estimated best-estimate \(\widehat{\mu}^B\), unaware \(\widehat{\mu}^U\), and aware \(\widehat{\mu}^A\) premiums for scenarios 1, 2, and 3 in the Example as a function of \(x_1\), \(x_2\), and \(d\).
R code producing the estimated aware, hyperaware, corrective illustration.
to_save_premiumsdense_AHC_perpop <- names(df_to_g) %>% lapply(function(pop_name){
  
  ## the colors
    cols <- RColorBrewer::brewer.pal(5, 'Spectral')[3:5] %>%  colorspace::darken(0.25)
  pop_id <- which(names(df_to_g) == pop_name)
  
  ## keep only axis for last plot
    if(pop_name == head(names(df_to_g), 1)){ # If it's the last, apply correct xlabels
      the_y_scale <- scale_y_continuous(breaks = c(90, 110, 130),
                     labels = scales::dollar,
                     limits = c(90, 130))
      the_y_label <- 
  latex2exp::TeX("$\\widehat{\\mu}(x_1, x_2, d)$")
    } else { # otherwise, remove everything
      the_y_scale <- scale_y_continuous(breaks = c(90, 110, 130),
                     labels = NULL,
                     limits = c(90, 130))
      the_y_label <- NULL
    }
  
  to_illustrate <- df_to_g[[pop_name]] %>%
  reshape2::melt(measure.vars = paste0(levels_for_premiums[3:5])) %>% 
    mutate(variable = factor(variable,
                             levels = paste0(levels_for_premiums[3:5]),
                             labels = labels_for_premiums[3:5])) %>% 
  filter(x1 <= 10, x1 >= -8) 
  # group_by(x1, d, variable) %>% summarise(value = mean(value)) %>%  ungroup %>% 
  
  ## mask part of the line 
  to_illustrate$value[to_illustrate$pdx < 0.1] <- NA
  
  to_illustrate %>% 
  ggplot(aes(x = x1, y = value,
             lty = factor(d),
             linewidth = factor(x2),
             shape = factor(x2),
             alpha = factor(d),
             color = factor(variable))) +
  geom_line() +
  scale_linetype_manual(values = c('solid', '32'), name = latex2exp::TeX('$d$')) +
  scale_color_manual(values = cols, name = latex2exp::TeX('$\\mu$'), labels = latex2exp::TeX) +
  scale_linewidth_manual(values = c(1.5, 1, 0.85, 0.55), name = latex2exp::TeX('$x_2$')) +  
  scale_alpha_manual(values = c(0.35, 0.7), name = latex2exp::TeX('$d$')) + 
  labs(x = latex2exp::TeX("$x_1$"), y = the_y_label,
       title = paste0('Scenario ', pop_id)) +
  scale_x_continuous(breaks = c( -3, 0, 3, 6), limits = c(-4, 7)) +
  the_y_scale +
  theme_minimal()
}) %>% ggpubr::ggarrange(plotlist = .,
                           ncol = 3,
                           widths = c(18, 15, 15),
                            heights = 1,
                           common.legend = T,
                           legend = 'right')

ggsave(filename = "figs/graph_premiumsdense_AHC_perpop.png",
       plot = to_save_premiumsdense_AHC_perpop,
       height = 4,
       width = 10.55,
       units = "in",
       device = "png", dpi = 500)
Figure 2.3: Estimated aware \(\widehat{\mu}^A\), hyperaware \(\widehat{\mu}^H\), and corrective \(\widehat{\mu}^C\) premiums for scenarios 1, 2, and 3 in the Example as a function of \(x_1\), \(x_2\), and \(d\).
History
Loading…